April 12, 2019
print("Hello, World!")
## [1] "Hello, World!"
?mean
str(iris)
## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
install.packages("tidyverse")
library(tidyverse)
devtools::install_github("hadley/tidyverse")
library(tidyverse)
count(iris, Species)
# ctrl + shift + m shortcut in RStudio
iris %>% count(Species)
## # A tibble: 3 x 2
##   Species        n
##   <fct>      <int>
## 1 setosa        50
## 2 versicolor    50
## 3 virginica     50
head(iris$Sepal.Length)
## [1] 5.1 4.9 4.7 4.6 5.0 5.4
c(1, 2, 3, 4, 5)
## [1] 1 2 3 4 5
# alt + - shortcut in RStudio
x <- 1:10
x
## [1] 1 2 3 4 5 6 7 8 9 10
str(iris)
## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
typeof(iris$Sepal.Length)
## [1] "double"
class(iris$Sepal.Length)
## [1] "numeric"
is.*() and as.*() functions
as.character(x)
## [1] "1" "2" "3" "4" "5" "6" "7" "8" "9" "10"
as.numeric(c(TRUE, FALSE))
## [1] 1 0
is.double(1.25)
## [1] TRUE
x[3] <- NA; x
## [1] 1 2 NA 4 5 6 7 8 9 10
is.na(x)
## [1] FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
mean(is.na(x))
## [1] 0.1
mean(x, na.rm = TRUE)
## [1] 5.777778
# good code includes comments
# comment the why, not the what
# ctrl + shift + c shortcut in RStudio
fit <- aov(Sepal.Length ~ Species, data = iris)
summary(fit)
##              Df Sum Sq Mean Sq F value Pr(>F)
## Species       2  63.21  31.606   119.3 <2e-16 ***
## Residuals   147  38.96   0.265
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
TukeyHSD(fit)
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
##
## Fit: aov(formula = Sepal.Length ~ Species, data = iris)
##
## $Species
##                       diff       lwr       upr p adj
## versicolor-setosa    0.930 0.6862273 1.1737727     0
## virginica-setosa     1.582 1.3382273 1.8257727     0
## virginica-versicolor 0.652 0.4082273 0.8957727     0
T
## [1] TRUE
T <- F
T
## [1] FALSE
"The tidyverse is an opinionated collection of R packages designed for data science. All packages share an underlying design philosophy, grammar, and data structures."
library(tidyverse)
library(dplyr)
filter()
arrange()
select()
rename()
mutate()
summarize()
group_by()
ntile()
sample_n()
inner_join()
count()

iris %>%
group_by(Species) %>%
summarize(mean_sl = mean(Sepal.Length),
sd_sl = sd(Sepal.Length),
n = n())
## # A tibble: 3 x 4
##   Species    mean_sl sd_sl     n
##   <fct>        <dbl> <dbl> <int>
## 1 setosa        5.01 0.352    50
## 2 versicolor    5.94 0.516    50
## 3 virginica     6.59 0.636    50
library(ggplot2)
ggplot has 3 elements:
aes() mapping inherited by downstream code
# basic structure
ggplot(df, aes()) + geom_*()
x
y
alpha
color
fill
shape
size
linetype

geom_col()
geom_bar()
geom_histogram()
geom_density()
geom_density2d()
geom_smooth()
geom_point()
geom_line()
geom_boxplot()
geom_raster()
geom_text()

ggplot(diamonds, aes(x = carat, y = price)) + geom_point()
ggplot(diamonds, aes(x = carat, y = price, color = color)) + geom_point()
ggplot(diamonds, aes(x = carat, y = price, color = color)) + geom_point() + facet_wrap(~cut)
ggplot(diamonds, aes(x = carat, y = price, color = color)) + geom_smooth() + facet_wrap(~cut)
ggplot(diamonds, aes(x = carat, y = price, color = color)) +
geom_point() +
facet_grid(clarity~cut) +
labs(x = "Carat",
y = "Price ($)",
color = "Color",
title = "Diamond Price by the 4 C's",
subtitle = "Carat, Color, Cut, & Clarity") +
theme_minimal()
library(tidyr)
gather()
spread()
nest()
separate()
replace_na()

tibble(norm_1 = rnorm(1000, mean = 0, sd = 1),
norm_2 = rnorm(1000, mean = 0, sd = 2),
norm_3 = rnorm(1000, mean = 0, sd = 3))
## # A tibble: 1,000 x 3 ## norm_1 norm_2 norm_3 ## <dbl> <dbl> <dbl> ## 1 1.33 1.56 -2.12 ## 2 -0.964 -0.543 0.993 ## 3 0.540 0.193 1.88 ## 4 0.493 -1.79 -0.773 ## 5 -0.0258 -4.58 3.32 ## 6 1.04 0.880 5.48 ## 7 0.305 -2.98 -2.46 ## 8 0.432 0.874 -1.69 ## 9 0.480 1.24 0.125 ## 10 0.390 0.0408 1.99 ## # ... with 990 more rows
tibble(norm_1 = rnorm(1000, mean = 0, sd = 1),
norm_2 = rnorm(1000, mean = 0, sd = 2),
norm_3 = rnorm(1000, mean = 0, sd = 3)) %>%
gather(dist, value)
## # A tibble: 3,000 x 2 ## dist value ## <chr> <dbl> ## 1 norm_1 -0.0632 ## 2 norm_1 -0.487 ## 3 norm_1 1.48 ## 4 norm_1 -0.673 ## 5 norm_1 0.724 ## 6 norm_1 0.911 ## 7 norm_1 -0.00545 ## 8 norm_1 0.739 ## 9 norm_1 -0.0564 ## 10 norm_1 1.87 ## # ... with 2,990 more rows
tibble(norm_1 = rnorm(1000, mean = 0, sd = 1),
norm_2 = rnorm(1000, mean = 0, sd = 2),
norm_3 = rnorm(1000, mean = 0, sd = 3)) %>%
gather(dist, value) %>%
ggplot(aes(x = value, fill = dist)) +
geom_density(alpha = 0.6) +
labs(x = "Value", y = "Density", fill = "Distribution") +
scale_x_continuous(breaks = seq(-20, 20, 2.5)) +
scale_fill_viridis_d() +
theme_minimal()
You may have heard a rumour that gather/spread are going away. This is simply not true (they'll stay around forever) but I am working on better replacements which you can learn about at https://t.co/sU2GzWeBaf. Now is a great time for feedback! #rstats
— Hadley Wickham (@hadleywickham) March 19, 2019
library(broom)
tidy()
glance()
augment()

fit <- lm(price ~ carat, data = diamonds)
summary(fit)
##
## Call:
## lm(formula = price ~ carat, data = diamonds)
##
## Residuals:
##      Min       1Q   Median       3Q      Max
## -18585.3   -804.8    -18.9    537.4  12731.7
##
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2256.36      13.06  -172.8   <2e-16 ***
## carat        7756.43      14.07   551.4   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1549 on 53938 degrees of freedom
## Multiple R-squared:  0.8493, Adjusted R-squared:  0.8493
## F-statistic: 3.041e+05 on 1 and 53938 DF,  p-value: < 2.2e-16
tidy() & glance()
tidy(fit)
## # A tibble: 2 x 5
##   term        estimate std.error statistic p.value
##   <chr>          <dbl>     <dbl>     <dbl>   <dbl>
## 1 (Intercept)   -2256.      13.1     -173.       0
## 2 carat          7756.      14.1      551.       0
glance(fit)
## # A tibble: 1 x 11 ## r.squared adj.r.squared sigma statistic p.value df logLik AIC ## <dbl> <dbl> <dbl> <dbl> <dbl> <int> <dbl> <dbl> ## 1 0.849 0.849 1549. 304051. 0 2 -4.73e5 9.45e5 ## # ... with 3 more variables: BIC <dbl>, deviance <dbl>, df.residual <int>
augment()
augment(fit)
## # A tibble: 53,940 x 9 ## price carat .fitted .se.fit .resid .hat .sigma .cooksd .std.resid ## <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> ## 1 326 0.23 -472. 10.4 798. 0.0000452 1549. 6.00e-6 0.516 ## 2 326 0.21 -628. 10.6 954. 0.0000471 1549. 8.92e-6 0.616 ## 3 327 0.23 -472. 10.4 799. 0.0000452 1549. 6.02e-6 0.516 ## 4 334 0.290 -7.00 9.77 341. 0.0000398 1549. 9.66e-7 0.220 ## 5 335 0.31 148. 9.57 187. 0.0000382 1549. 2.78e-7 0.121 ## 6 336 0.24 -395. 10.3 731. 0.0000442 1549. 4.93e-6 0.472 ## 7 336 0.24 -395. 10.3 731. 0.0000442 1549. 4.93e-6 0.472 ## 8 337 0.26 -240. 10.1 577. 0.0000424 1549. 2.94e-6 0.372 ## 9 337 0.22 -550. 10.5 887. 0.0000461 1549. 7.56e-6 0.573 ## 10 338 0.23 -472. 10.4 810. 0.0000452 1549. 6.18e-6 0.523 ## # ... with 53,930 more rows
for() loops with better performance and increased transparency in code
apply() family of functions
map()
map2()
pmap()

| Function | Returns |
|---|---|
| map | list |
| map_chr | character vector |
| map_dbl | double (numeric) vector |
| map_dfc | data frame (column bind) |
| map_dfr | data frame (row bind) |
| map_int | integer vector |
| map_lgl | logical vector |
| walk | invisible |
# imagine data.frame object d already exists and we're creating new, random variables
# (each column holds 1,000 normal draws whose mean matches the column number)
d$var1 <- rnorm(n = 1000, mean = 1)
d$var2 <- rnorm(n = 1000, mean = 2)
d$var3 <- rnorm(n = 1000, mean = 3)
# was a second `d$var3`, which overwrote var3 and never created var4
d$var4 <- rnorm(n = 1000, mean = 4)
d$var5 <- rnorm(n = 1000, mean = 5)
d$var6 <- rnorm(n = 1000, mean = 6)
d$var7 <- rnorm(n = 1000, mean = 7)
d$var8 <- rnorm(n = 1000, mean = 8)
d$var9 <- rnorm(n = 1000, mean = 9)
d$var10 <- rnorm(n = 1000, mean = 10)
map()
map_dfc(1:10, rnorm, n = 1000)
## # A tibble: 1,000 x 10 ## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 ## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> ## 1 2.44 0.714 3.00 2.61 3.50 5.35 7.06 7.66 8.31 9.98 ## 2 -0.333 2.55 3.81 4.89 4.99 4.44 7.22 5.51 9.79 9.71 ## 3 1.36 2.32 2.77 4.11 5.93 5.38 8.69 7.81 6.67 9.32 ## 4 0.854 3.96 1.95 4.12 6.05 6.21 6.30 8.24 8.40 8.27 ## 5 -0.0318 1.89 1.14 3.58 5.13 4.08 8.02 7.45 10.4 8.52 ## 6 2.80 1.76 2.91 3.06 5.16 7.38 5.78 7.96 8.79 11.3 ## 7 0.141 1.62 2.76 4.86 4.45 5.95 6.53 8.07 10.1 11.0 ## 8 1.63 1.02 3.65 3.58 3.91 4.96 7.17 8.31 9.18 9.77 ## 9 0.609 3.01 3.17 4.76 6.48 6.88 6.86 7.41 10.0 10.6 ## 10 0.517 2.34 3.69 4.58 3.86 5.93 7.01 6.32 9.11 8.45 ## # ... with 990 more rows
rmarkdown and shiny
library(plotly)
ggplotly()
plot_ly()
add_trace()
layout()
ggplot2 and wrap it in a ggplotly() function

p <-
ggplot(diamonds, aes(x = cut, y = price)) +
geom_boxplot()
ggplotly(p)
plot_ly(iris, x = ~Sepal.Length, y = ~Sepal.Width,
text = ~Species, color = ~Species, colors = "Set1") %>%
add_trace(type = 'scatter', mode = 'markers') %>%
layout(xaxis = list(title = "Sepal Length"),
yaxis = list(title = "Sepal Width"),
title = "Iris Data")
library(gapminder)
gapminder %>%
plot_ly(
x = ~gdpPercap,
y = ~lifeExp,
size = ~pop,
color = ~continent,
colors = "Set1",
frame = ~year,
text = ~country,
hoverinfo = "text",
type = "scatter",
mode = "markers"
) %>%
layout(
xaxis = list(
type = "log"
)
)
library(lubridate)
ymd(20190412)
## [1] "2019-04-12"
mdy("April 12th, 2019")
## [1] "2019-04-12"
now()
## [1] "2019-04-10 15:28:32 CDT"
year(now())
## [1] 2019
month(now(), label = TRUE, abbr = FALSE)
## [1] April
## 12 Levels: January < February < March < April < May < June < ... < December
ymd(20190330) - months(1)
## [1] NA
ymd(20190330) %m-% months(1)
## [1] "2019-02-28"
tidyverse
grep() family of functions
library(stringr)
names(iris)
## [1] "Sepal.Length" "Sepal.Width"  "Petal.Length" "Petal.Width"
## [5] "Species"
# Standardize the iris column names to lower snake_case: replace every
# literal dot with an underscore, then lower-case the result.
# A formula lambda is used in place of funs(), which has been deprecated
# since dplyr 0.8.0; the renamed columns are the same.
iris %>%
  as_tibble() %>%
  rename_all(~ str_to_lower(str_replace_all(.x, "\\.", "_"))) %>%
  head()
## # A tibble: 6 x 5 ## sepal_length sepal_width petal_length petal_width species ## <dbl> <dbl> <dbl> <dbl> <fct> ## 1 5.1 3.5 1.4 0.2 setosa ## 2 4.9 3 1.4 0.2 setosa ## 3 4.7 3.2 1.3 0.2 setosa ## 4 4.6 3.1 1.5 0.2 setosa ## 5 5 3.6 1.4 0.2 setosa ## 6 5.4 3.9 1.7 0.4 setosa
library(forcats)
I feel like I make this plot every day: count, fct_reorder, geom_col, coord_flip @robinson_es #rstatsnyc pic.twitter.com/2jdlJRWdBe
— David Robinson (@drob) April 20, 2018
diamonds %>%
count(cut) %>%
mutate(cut = fct_reorder(cut, n)) %>%
ggplot(aes(x = cut, y = n)) +
geom_col() +
coord_flip()
https://rmarkdown.rstudio.com/
library(rmarkdown)
names(knitr::knit_engines$get())
##  [1] "awk"       "bash"      "coffee"    "gawk"      "groovy"
##  [6] "haskell"   "lein"      "mysql"     "node"      "octave"
## [11] "perl"      "psql"      "Rscript"   "ruby"      "sas"
## [16] "scala"     "sed"       "sh"        "stata"     "zsh"
## [21] "highlight" "Rcpp"      "tikz"      "dot"       "c"
## [26] "fortran"   "fortran95" "asy"       "cat"       "asis"
## [31] "stan"      "block"     "block2"    "js"        "css"
## [36] "sql"       "go"        "python"    "julia"
To apply or learn more visit www.colonybrands.com
justin.marschall@imsdm.com
jcmarschall
justinmarschall
justinmarschall
data()